# vrsPigmentsE v.0
# A combination of chl-a and cyanobacterial reconstructions
# To be run using raw .xls files from VRS output

suppressPackageStartupMessages({
  library(pls)
  library(rioja)
  library(tidyverse)
  library(readxl)
  library(gridExtra)
  library(grid)
  library(pdftools)
})

buildCyanoCalibration <- function() {
  
  cat("Select Favot vrsCyanobacteria calibration data (.rds)")
  Calib <- read_rds(file.choose())
  
  NIRS <- as.matrix(Calib[, 2:1000])
  NIRSMSC <- msc(NIRS)
  NIRSframe <- data.frame(pig = Calib[, 1], NIRS = I(NIRS))
  NIRSframeMSC <- data.frame(pig = Calib[, 1], NIRS = I(NIRSMSC))
  
  pls.options(plsralg = "oscorespls")
  
  PLS_noMSC <- plsr(
    pig ~ NIRS,
    ncomp = 10,
    data = NIRSframe,
    validation = "CV",
    segments = 10
  )
  PLS_MSC <- plsr(
    pig ~ NIRS,
    ncomp = 10,
    data = NIRSframeMSC,
    validation = "CV",
    segments = 10
  )
  
  assign("CyanoCalib", Calib, envir = .GlobalEnv)
  assign("PLS_cyano_noMSC", PLS_noMSC, envir = .GlobalEnv)
  assign("PLS_cyano_MSC", PLS_MSC, envir = .GlobalEnv)
  
  message("✔ Cyanobacteria calibration loaded and model built")
}

vrsPigmentsE <- function() {
  
  ## ==================================================
  ## Calibration check
  ## ==================================================
  if (!exists("CyanoCalib", envir = .GlobalEnv)) {
    stop("Calibration not found. Run buildCyanoCalibration() first.")
  }
  
  ## ==================================================
  ## Load data + midpoint parsing
  ## ==================================================
  cat("Select raw spectral (.xlsx) data:")
  data.file <- file.choose()
  setwd(dirname(data.file))
  data <- read_excel(data.file, sheet = 1)
  data.title <- sub("\\.xls$", "", basename(data.file))
  
  chla.data <- data.frame()
  
  for (i in 2:ncol(data)) {
    if (grepl("-", data[2, i])) {
      a <- str_split_fixed(data[2, i], "-", 2)
      b <- numeric(2)
      b[1] <- as.numeric(a[1])
      b[2] <- as.numeric(a[2])
      chla.data[i - 1, 1] <- mean(b)
    } else {
      chla.data[i - 1, 1] <- as.numeric(data[2, i])
    }
  }
  
  ## ==================================================
  ## Midpoint error detection
  ## ==================================================
  NAVect <- integer(0)
  for (i in seq_len(nrow(chla.data))) {
    if (is.na(chla.data[i, 1])) {
      NAVect <- append(NAVect, i)
    }
  }
  NAVect <- NAVect + 1
  
  if (length(NAVect) > 0) {
    
    num_to_letters <- function(n) {
      letters_vec <- LETTERS
      result <- ""
      while (n > 0) {
        n <- n - 1
        result <- paste0(letters_vec[(n %% 26) + 1], result)
        n <- n %/% 26
      }
      result
    }
    
    errorVect <- character(length(NAVect))
    for (i in seq_along(NAVect)) {
      errorVect[i] <- num_to_letters(NAVect[i])
    }
    
    errorMessageFinal <- paste(
      paste("There's an issue with the sample names in column(s):",
            paste(errorVect, collapse = ", "),
            "in the Excel sheet."),
      "Please go back to the spectra sheet and check the sample names (3rd row).",
      "Common errors include naming samples '21--21.5' instead of '21-21.5', '21*21.5' instead of '21-21.5'",
      sep = "\n\n"
    )
    
    cat(errorMessageFinal)
    stop()
  }
  
  ## ==================================================
  ## Chl-a inference
  ## ==================================================
  for (i in 1:26) {
    for (j in seq_len(nrow(chla.data))) {
      chla.data[j, i + 1] <- as.numeric(data[i + 127, j + 1])
    }
  }
  
  for (i in 1:27) {
    
    if (i == 1) next
    
    if (i == 27) {
      
      a <- numeric(nrow(chla.data))
      b <- numeric(nrow(chla.data))
      
      for (j in seq_len(nrow(chla.data))) {
        
        peak.area <-
          sum(chla.data[j, -1]) -
          ((chla.data[j, 2] * 51) +
             (((chla.data[j, 52] - chla.data[j, 2]) * 51) / 2))
        
        a[j] <- 0.0919 * peak.area + 0.0011
        b[j] <- exp(0.83784 * log(peak.area) - 2.48861)
      }
      
      chla.data <- add_column(chla.data, chla = a)
      chla.data <- add_column(chla.data, chla.LogTransform = b)
      
    } else {
      
      frontcol <- i * 2 - 2
      backcol  <- i * 2 - 1
      
      newcol <- (chla.data[, frontcol] + chla.data[, backcol]) / 2
      chla.data <- add_column(chla.data, newcol, .after = frontcol)
    }
  }
  
  colnames(chla.data) <- c("midpoint", 650:700, "chla DEPRECIATED", "chlaLOG")
  chla.data <- chla.data[order(chla.data$midpoint), ]
  chla.data$chlaLOGz <- scale(chla.data$chlaLOG)
  
  ## ==================================================
  ## Cyanobacteria inference
  ## ==================================================
  cyanoData <- data[-1, ]
  cyanoData <- as.data.frame(t(cyanoData))
  colnames(cyanoData) <- cyanoData[1, ]
  cyanoData <- cyanoData[-1, ]
  cyanoData$Sample <- chla.data$midpoint
  
  cyanoData <- cyanoData[, -(127:152), drop = FALSE]
  Pred <- cyanoData[, -(2:26), drop = FALSE]
  
  Pred <- as.matrix(data.frame(lapply(Pred, unlist)))
  storage.mode(Pred) <- "double"
  
  NIRSpred <- as.matrix(Pred[, 2:1000])
  NIRSpredMSC <- msc(NIRSpred)
  
  NIRSframepred <- data.frame(pig = Pred[, 1], NIRS = I(NIRSpred))
  NIRSframepredMSC <- data.frame(pig = Pred[, 1], NIRS = I(NIRSpredMSC))
  
  results <- data.frame(
    midpoint = chla.data$midpoint,
    no.MSC   = predict(PLS_cyano_noMSC, ncomp = 2, newdata = NIRSframepred),
    model.MSC = predict(PLS_cyano_MSC, ncomp = 2, newdata = NIRSframepred),
    full.MSC  = predict(PLS_cyano_MSC, ncomp = 2, newdata = NIRSframepredMSC),
    data.MSC  = predict(PLS_cyano_noMSC, ncomp = 2, newdata = NIRSframepredMSC)
  )
  
  colnames(results) <- c("midpoint", "no.MSC", "model.MSC", "full.MSC", "data.MSC")
  
  results$no.MSC.z    <- as.numeric(scale(results$no.MSC))
  results$model.MSC.z <- as.numeric(scale(results$model.MSC))
  results$full.MSC.z  <- as.numeric(scale(results$full.MSC))
  results$data.MSC.z  <- as.numeric(scale(results$data.MSC))
  
  ## ==================================================
  ## Plots
  ## ==================================================
  p_chla_log <- ggplot(chla.data, aes(chlaLOG, midpoint)) +
    geom_line(color = "seagreen", orientation = "y") +
    geom_point(size = 0.5) +
    ylab("Depth") +
    xlab(expression(paste("Concentration (mg"%.%"g"^{-1}, " dry mass)"))) +
    scale_y_reverse() +
    theme_classic() +
    ggtitle(bquote(.(data.title) ~ "chl-" * italic(a) ~ "(log)"))
  
  p_chla_z <- ggplot(chla.data, aes(chlaLOGz, midpoint)) +
    geom_line(color = "seagreen", orientation = "y") +
    geom_point(size = 0.5) +
    ylab("Depth") +
    xlab("z-score") +
    scale_y_reverse() +
    theme_classic() +
    ggtitle(bquote(.(data.title) ~ "chl-" * italic(a) ~ "(log)"))
  
  p_cyano <- ggplot(results, aes(no.MSC, midpoint)) +
    geom_line(color = "darkred", orientation = "y") +
    geom_point(size = 0.5) +
    ylab("Depth") +
    xlab(expression("Concentration ppt")) +
    scale_y_reverse() +
    theme_classic() +
    ggtitle(paste(data.title, "Cyanobacteria (No MSC)"))
  
  p_cyano_z <- ggplot(results, aes(no.MSC.z, midpoint)) +
    geom_line(color = "darkred", orientation = "y") +
    geom_point(size = 0.5) +
    ylab("Depth") +
    xlab(expression("z-score")) +
    scale_y_reverse() +
    theme_classic() +
    ggtitle(paste(data.title, "Cyanobacteria (No MSC)"))
  
  ## ==================================================
  ## Outputs
  ## ==================================================
  
  pdf(paste(data.title, "VRS Pigments Summary.pdf"), width = 8, height = 10,
      title = paste(data.title, "VRS Pigments Summary"))
  
  grid.arrange(
    p_chla_log, p_chla_z,
    p_cyano,   p_cyano_z,
    ncol = 2
  )
  
  grid.newpage()
  grid.text(
    "Notes:
    
  We recommend reporting all values as z-scores.
  We also recommend using the results from the No Multiplicative Scatter Correction 
  (No MSC) cyanobacteria model.
  NAs are produced in the chl-a output when peak area <0. This is considered the 
  Method Detection Limit.
  The ogirinal equation from Michelutti et al. (2010) is considered depreciated.
  A log-transformed version of the Michelutti et al. (2010) equation is now used in all
  chl-a VRS reconstructions: 
  
            chla + derivatives = exp(0.83784 * ln(peak area 650-700nm) - 2.48861)

  References:
  
  Michelutti N, Blais J, Cumming B, Paterson AM, Ruhland K, Wolfe AP, Smol JP. 2010.
  Do spectrally inferred determinations of chlorophyll a reflect trends in lake trophic status?
  Journal of Paleolimnology 43(2):205–217.
  https://doi.org/10.1007/s10933-009-9325-8

  Favot EJ, Hadley KR, Paterson AM, Michelutti N, Watson SB, Zastepa A, Hutchinson NJ,
  Vinebrooke RD, Smol JP. 2020. Using visible near-infrared reflectance spectroscopy (VNIRS) 
  of lake sediments to estimate historical changes in cyanobacterial production: potential 
  and challenges. Journal of Paleolimnology 64:335–345.
  https://doi.org/10.1007/s10933-020-00140-2

  Original code is maintained by EJ at PEARL/Queen's University.
  Please email groomsC@QueensU.ca for support.",
    x = unit(0.05, "npc"),     
    y = unit(0.95, "npc"),     
    just = c("left", "top"),
    gp = gpar(fontsize = 12)
  )
  
  
  dev.off()
  
  write.csv(chla.data, paste(data.title, "chla data.csv"))
  write.csv(results, paste(data.title, "cyanobacteria data.csv"))
  
  message("✔ Pigment inference complete")
}

# Run this first once, using the calibration sheet:
buildCyanoCalibration()

# Run this as many times as you want:
vrsPigmentsE()